A Netflix Analysis
Installing pacman
# Install pacman only when it is not available yet. Calling install.packages()
# unconditionally re-installs on every run and fails with
# "Updating loaded packages" when pacman is already loaded in the session.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
Error in install.packages : Updating loaded packages
library(pacman)
Loading the packages needed
pacman::p_load(dplyr,tidyverse,tidyr, janitor, lubridate, ggplot2, leaflet, plotly, readxl)
Loading our dataset
netflix= read.csv("netflix_titles.csv")
Preview of our dataset
head(netflix, 5)
Let's check the column names
names(netflix)
[1] "show_id" "type" "title" "director" "cast" "country" "date_added"
[8] "release_year" "rating" "duration" "listed_in" "description" "X" "X.1"
[15] "X.2" "X.3" "X.4" "X.5" "X.6" "X.7" "X.8"
[22] "X.9" "X.10" "X.11" "X.12" "X.13"
Let's look at each column and its data type
str(netflix)
'data.frame': 8809 obs. of 26 variables:
$ show_id : chr "s1" "s2" "s3" "s4" ...
$ type : chr "Movie" "TV Show" "TV Show" "TV Show" ...
$ title : chr "Dick Johnson Is Dead" "Blood & Water" "Ganglands" "Jailbirds New Orleans" ...
$ director : chr "Kirsten Johnson" "" "Julien Leclercq" "" ...
$ cast : chr "" "Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile "| __truncated__ "Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, G"| __truncated__ "" ...
$ country : chr "United States" "South Africa" "" "" ...
$ date_added : chr "September 25, 2021" "September 24, 2021" "September 24, 2021" "September 24, 2021" ...
$ release_year: int 2020 2021 2021 2021 2021 2021 2021 1993 2021 2021 ...
$ rating : chr "PG-13" "TV-MA" "TV-MA" "TV-MA" ...
$ duration : chr "90 min" "2 Seasons" "1 Season" "1 Season" ...
$ listed_in : chr "Documentaries" "International TV Shows, TV Dramas, TV Mysteries" "Crime TV Shows, International TV Shows, TV Action & Adventure" "Docuseries, Reality TV" ...
$ description : chr "As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical wa"| __truncated__ "After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is h"| __truncated__ "To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled "| __truncated__ "Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Or"| __truncated__ ...
$ X : logi NA NA NA NA NA NA ...
$ X.1 : logi NA NA NA NA NA NA ...
$ X.2 : logi NA NA NA NA NA NA ...
$ X.3 : logi NA NA NA NA NA NA ...
$ X.4 : logi NA NA NA NA NA NA ...
$ X.5 : logi NA NA NA NA NA NA ...
$ X.6 : logi NA NA NA NA NA NA ...
$ X.7 : logi NA NA NA NA NA NA ...
$ X.8 : logi NA NA NA NA NA NA ...
$ X.9 : logi NA NA NA NA NA NA ...
$ X.10 : logi NA NA NA NA NA NA ...
$ X.11 : logi NA NA NA NA NA NA ...
$ X.12 : logi NA NA NA NA NA NA ...
$ X.13 : logi NA NA NA NA NA NA ...
Let's convert the `type` column to a factor
netflix$type= as.factor(netflix$type)
Investigating missing (NA) values
NAs=colSums(is.na(netflix))
names(netflix)[NAs>0]
[1] "X" "X.1" "X.2" "X.3" "X.4" "X.5" "X.6" "X.7" "X.8" "X.9" "X.10" "X.11" "X.12" "X.13"
Dimensions of the dataset (rows, columns)
dim(netflix)
[1] 8809 26
Number of NA values in each column
colSums(is.na(netflix))
show_id type title director cast country date_added release_year
0 0 0 0 0 0 0 0
rating duration listed_in description X X.1 X.2 X.3
0 0 0 0 8809 8809 8809 8809
X.4 X.5 X.6 X.7 X.8 X.9 X.10 X.11
8809 8809 8809 8809 8809 8809 8809 8809
X.12 X.13
8809 8809
Since the columns that contain NAs are entirely empty (every row is NA) and
carry no information, we can remove them
netflix= netflix %>%
select(show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description)
Descriptive Statistics:
Summarize the distribution of the types of shows (Movies vs. TV
Shows).
#Distribution of movie type
netflix_type= netflix %>%
select(type) %>%
group_by(type) %>%
summarise(totalcount= n())
# Calculate percentage labels
percentages <- round(netflix_type$totalcount / sum(netflix_type$totalcount) * 100, 1)
labels <- paste(netflix_type$type, percentages, "%", sep = " ")
# Create the pie chart using plotly
fig <- plot_ly(netflix_type, labels = ~type, values = ~totalcount, type = 'pie',
textinfo = 'label+percent',
insidetextorientation = 'radial',
marker = list(colors = c('red', 'green')))
# Customize the layout
fig <- fig %>% layout(title = 'Distribution of TV Shows and Movies')
# Display the plot
fig
Calculate the number of shows released per year.
netflix_releaseyear= netflix %>%
select(release_year, type) %>%
group_by(release_year, type) %>%
summarise(total_moveis_or_shows= n(), .groups = "drop")
netflix_release_year= netflix %>%
select(release_year, type) %>%
group_by(release_year) %>%
summarise(total_shows= n()) %>%
arrange(desc(total_shows)) %>%
head(10)
fig4=plot_ly(netflix_release_year, x= ~release_year, y= ~total_shows,type= 'bar',
text= ~total_shows,
textposition = "auto",
marker=list(color="green")) %>%
layout(title= "Top 10 years with the highest number of shows produced",
xaxis= list(title="Years"),
yaxis= list(title="Total Shows"))
fig4
Analyze the distribution of ratings (e.g., TV-MA, PG-13, etc.).
netflix_ratings= netflix %>%
select(rating) %>%
group_by(rating) %>%
summarize(total_rating=n())
#the plot
fig3 =plot_ly(netflix_ratings, x= ~rating, y= ~total_rating, type='bar', marker=list(color= "red"))%>%
layout(title="Distribution of the ratings",
xaxis= list(title="Ratings"),
yaxis= list(title="Total ratings"))
fig3
Trend Analysis:
Analyze the popularity of different genres over the years.
netflix_genre= netflix %>%
select(release_year, listed_in) %>%
group_by(release_year, listed_in) %>%
summarise(total_shows= n(), .groups = "drop")
Genre Analysis:
Determine the most common genres listed.
netflix_genre_common= netflix %>%
select(listed_in) %>%
group_by(listed_in) %>%
summarise(total_count= n()) %>%
arrange(desc(total_count)) %>%
head(10)
fig5 = plot_ly(netflix_genre_common, y= ~listed_in, x= ~total_count, type= 'bar',
text= ~total_count,
textposition= "auto",
marker= list(color="orange")) %>%
layout(
title= "Top 10 Most common genres Listed",
yaxis = list(title="Genres"),
xaxis = list(title="Total Count")
)
fig5
Analyze the correlation between genres and ratings.
netflix_corr_genre= netflix %>%
select(listed_in, rating) %>%
group_by(listed_in, rating) %>%
summarise(total_count = n(), .groups = "drop")
print(netflix_corr_genre)
Country Analysis:
Analyze the countries with the most movies produced
# Keep only the first country listed for each title. A trailing comma is
# appended so that single-country rows also split into two pieces; any further
# countries are dropped explicitly instead of via a warning.
netflix_country <- netflix %>%
  select(country) %>%
  mutate(country = paste0(country, ",")) %>%
  separate(
    col = country, into = c("Country", "Rest"), sep = ",",
    extra = "drop"
  )

# Number of titles per (first-listed) country, most frequent first.
netflix_country_grouped <- netflix_country %>%
  group_by(Country) %>%
  summarise(Total_Movies = n()) %>%
  arrange(desc(Total_Movies))

# Titles with no country come through as "", so turn those into NA and drop
# them.
netflix_country_grouped_clean <- netflix_country_grouped %>%
  mutate(Country = na_if(Country, ""))
netflix_country_grouped_clean <- na.omit(netflix_country_grouped_clean)

# Fix: take the top 10 from the *cleaned* table. The uncleaned table was used
# before, so the blank "country" showed up as one of the bars.
netflix_country_grouped_plot <- head(netflix_country_grouped_clean, 10)

# Fix: the plotly argument is `marker`, not `markers`; with the typo the
# requested bar colour was not applied.
fig8 <- plot_ly(netflix_country_grouped_plot,
  x = ~Country, y = ~Total_Movies, type = "bar",
  text = ~Total_Movies,
  marker = list(color = "yellow")
) %>%
  layout(
    title = "Top 10 Countries with the Highest No of movies Produced",
    xaxis = list(title = "Countries", tickangle = -45),
    yaxis = list(title = "Total Movies Produced")
  )
fig8
NA
Map
# # Get world map data for country coordinates
# world_map <- map_data("world")
#
# # Prepare the data by merging with coordinates
# country_coords <- world_map %>%
# group_by(region) %>%
# summarize(
# lat = mean(lat),
# lng = mean(long)
# ) %>%
# rename(Country = region)
#
# # Merge country data with coordinates
# map_data <- netflix_country_grouped_clean %>%
# left_join(country_coords, by = "Country")
#
# # Filter out rows with missing or invalid coordinates
# map_data_filtered <- map_data %>%
# filter(!is.na(lat) & !is.na(lng))
#
# # Create an interactive map with markers
# m <- leaflet(map_data_filtered) %>%
# addTiles() %>%
# addMarkers(
# clusterOptions = markerClusterOptions(),
# ~lng, ~lat,
# popup = ~paste("<strong>Country:</strong>", Country, "<br>",
# "<strong>Value:</strong>", Total_Movies)
# )
#
# # Display the map
# m
Analyze the diversity of content by country.
United States
netflix_diversity= netflix %>%
select(country, listed_in) %>%
group_by(country, listed_in) %>%
summarise(total_count=n(), .groups = "drop") %>%
arrange(desc(total_count))
print(netflix_diversity)
Duration Analysis:
Compare the average duration of movies vs. TV shows.
netflix$duration= as.character(netflix$duration)
netflix_duration <- netflix %>%
select(type, duration) %>%
separate(col = duration, into = c("duration", "units"), sep = " ")
# Convert duration back to integer
netflix_duration$duration <- as.integer(netflix_duration$duration)
netflix_duration_compison_type= netflix_duration %>%
group_by(type) %>%
summarise(Average_Duration= floor(mean(duration, na.rm = TRUE)))
print(netflix_duration_compison_type)
Analyze the distribution of the number of seasons for TV shows.
netflix_tvshows_distribution= netflix %>%
select(type, duration) %>%
filter(type == "TV Show") %>%
group_by(duration) %>%
summarise(Frequency_totals= n()) %>%
arrange(desc(Frequency_totals))
netflix_tvshows_distribution$duration= factor(netflix_tvshows_distribution$duration, levels= unique(netflix_tvshows_distribution$duration))
# Create a bar chart
fig6 = plot_ly(netflix_tvshows_distribution, x = ~duration, y= ~Frequency_totals, type = 'bar',
marker=list(color="green")) %>%
layout(
title= "Distribution of Seasons in the TV Shows",
xaxis= list(title ="Duration", tickangle=-45),
yaxis= list(title= "Frequency")
)
# Display the plot
fig6
---
title: "R Notebook"
output: html_notebook
---

# A Netflix Analysis
#### Installing pacman
```{r, warning = FALSE}
# Install pacman only when it is not available yet. Calling install.packages()
# unconditionally re-installs on every run and fails with
# "Updating loaded packages" when pacman is already loaded in the session.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)
```
#### Loading the packages needed
```{r}
# Attach every package used by the notebook (pacman installs missing ones).
pacman::p_load(
  dplyr, tidyverse, tidyr, janitor, lubridate,
  ggplot2, leaflet, plotly, readxl
)
```
#### Loading our dataset
```{r}
# Read the raw titles table from the working directory.
netflix <- read.csv(file = "netflix_titles.csv")
```
#### Preview of our dataset
```{r}
# Show the first five rows.
head(netflix, n = 5)
```
#### Let's check the column names
```{r}
# List the column names of the raw table.
colnames(netflix)
```
#### Let's look at each column and its data type
```{r}
# Compact overview: one line per column with its type and first values.
utils::str(netflix)
```
#### Let's convert the `type` column to a factor
```{r}
# Store the content type as a factor.
netflix[["type"]] <- as.factor(netflix[["type"]])
```

#### Investigating missing (NA) values
```{r}
# Count the NA entries of every column, then list the columns that have any.
NAs <- colSums(is.na(netflix))
colnames(netflix)[NAs > 0]
```
#### Dimensions of the dataset (rows, columns)
```{r}
# Number of rows followed by number of columns.
c(nrow(netflix), ncol(netflix))
```
#### Number of NA values in each column
```{r}
# Per-column count of NA entries.
netflix %>%
  is.na() %>%
  colSums()
```

#### Since the columns that contain NAs are entirely empty (every row is NA) and carry no information, we can remove them
```{r}
# Keep only the twelve documented columns, which drops the X, X.1, ... columns.
netflix <- netflix %>%
  select(all_of(c(
    "show_id", "type", "title", "director", "cast", "country",
    "date_added", "release_year", "rating", "duration", "listed_in",
    "description"
  )))
```

## Descriptive Statistics:
### Summarize the distribution of the types of shows (Movies vs. TV Shows).
```{r}
# Number of titles per content type.
netflix_type <- netflix %>%
  group_by(type) %>%
  summarise(totalcount = n())

# Percentage share of each type and matching "<type> <pct> %" labels.
percentages <- with(netflix_type, round(totalcount / sum(totalcount) * 100, 1))
labels <- paste(netflix_type$type, percentages, "%")

# Pie chart of the type split, built and titled in one pipeline.
fig <- netflix_type %>%
  plot_ly(
    labels = ~type, values = ~totalcount, type = "pie",
    textinfo = "label+percent",
    insidetextorientation = "radial",
    marker = list(colors = c("red", "green"))
  ) %>%
  layout(title = "Distribution of TV Shows and Movies")

# Render the chart.
fig
```
### Calculate the number of shows released per year.
```{r}
# Titles per release year, broken down by content type.
netflix_releaseyear <- netflix %>%
  group_by(release_year, type) %>%
  summarise(total_moveis_or_shows = n(), .groups = "drop")

# The ten release years with the most titles, largest first.
netflix_release_year <- netflix %>%
  group_by(release_year) %>%
  summarise(total_shows = n()) %>%
  arrange(desc(total_shows)) %>%
  head(n = 10)

# Bar chart of those ten years, each bar labelled with its count.
fig4 <- netflix_release_year %>%
  plot_ly(
    x = ~release_year, y = ~total_shows, type = "bar",
    text = ~total_shows,
    textposition = "auto",
    marker = list(color = "green")
  ) %>%
  layout(
    title = "Top 10 years with the highest number of shows produced",
    xaxis = list(title = "Years"),
    yaxis = list(title = "Total Shows")
  )
fig4
```

### Analyze the distribution of ratings (e.g., TV-MA, PG-13, etc.).
```{r}
# Number of titles carrying each rating.
netflix_ratings <- netflix %>%
  group_by(rating) %>%
  summarise(total_rating = n())

# Bar chart with one bar per rating.
fig3 <- netflix_ratings %>%
  plot_ly(
    x = ~rating, y = ~total_rating, type = "bar",
    marker = list(color = "red")
  ) %>%
  layout(
    title = "Distribution of the ratings",
    xaxis = list(title = "Ratings"),
    yaxis = list(title = "Total ratings")
  )

fig3
```
## Trend Analysis:
### Explore the trends in the number of shows added to the platform over time.
```{r}
# NOTE(review): the section heading talks about titles *added* to the platform,
# but this chunk groups by `release_year`, not `date_added` - confirm that the
# release year is the intended time axis.

# Number of titles per release year.
netflix_release_year_group <- netflix %>%
  select(release_year, type) %>%
  group_by(release_year) %>%
  summarise(total_shows = n())

# Bucket the years into decades and total the titles per bucket. case_when()
# uses the first matching condition, so each `<` test only catches years not
# already claimed by an earlier line.
netflix_release_year_grouped <- netflix_release_year_group %>%
  mutate(release_year_grouped = case_when(
    release_year < 1930 ~ "1920-1930",
    release_year < 1940 ~ "1930-1940",
    release_year < 1950 ~ "1940-1950",
    release_year < 1960 ~ "1950-1960",
    # Fix: this bucket holds 1960-1969 but was labelled "1950-1970".
    release_year < 1970 ~ "1960-1970",
    release_year < 1980 ~ "1970-1980",
    release_year < 1990 ~ "1980-1990",
    release_year < 2000 ~ "1990-2000",
    release_year < 2010 ~ "2000-2010",
    release_year < 2020 ~ "2010-2020",
    release_year >= 2020 ~ "2020"
  )) %>%
  group_by(release_year_grouped) %>%
  summarise(total_shows = sum(total_shows))

# Line chart of the per-decade totals.
fig1 <- plot_ly(netflix_release_year_grouped,
  x = ~release_year_grouped, y = ~total_shows,
  type = "scatter", mode = "lines"
) %>%
  layout(
    title = "Netflix Shows by Release Year",
    xaxis = list(title = "Release Year", tickangle = -45),
    yaxis = list(title = "Total Shows")
  )

# Render the chart.
fig1
```

### Analyze the popularity of different genres over the years.
```{r}
# Number of titles for every (release year, `listed_in` value) pair.
netflix_genre <- netflix %>%
  group_by(release_year, listed_in) %>%
  summarise(total_shows = n(), .groups = "drop")
```
## Genre Analysis:
### Determine the most common genres listed.
```{r}
# `listed_in` holds a comma-separated list of genres per title (e.g.
# "Docuseries, Reality TV"). Grouping on the raw string ranks genre
# *combinations*, so split it into one row per genre before counting.
netflix_genre_common <- netflix %>%
  select(listed_in) %>%
  separate_rows(listed_in, sep = ",") %>%
  mutate(listed_in = trimws(listed_in)) %>% # drop the space left after each comma
  filter(listed_in != "") %>% # ignore titles with no genre
  group_by(listed_in) %>%
  summarise(total_count = n()) %>%
  arrange(desc(total_count)) %>%
  head(10)

# Horizontal bar chart of the ten most frequent genres, labelled with counts.
fig5 <- plot_ly(netflix_genre_common,
  y = ~listed_in, x = ~total_count, type = "bar",
  text = ~total_count,
  textposition = "auto",
  marker = list(color = "orange")
) %>%
  layout(
    title = "Top 10 Most common genres Listed",
    yaxis = list(title = "Genres"),
    xaxis = list(title = "Total Count")
  )

fig5
```

### Analyze the correlation between genres and ratings.
```{r}
# Count titles for every (`listed_in` value, rating) pair and print the table.
netflix_corr_genre <- netflix %>%
  group_by(listed_in, rating) %>%
  summarise(total_count = n(), .groups = "drop")
print(netflix_corr_genre)
```

## Country Analysis:
### Analyze the countries with the most movies produced
```{r, warning=FALSE}
# Keep only the first country listed for each title. A trailing comma is
# appended so that single-country rows also split into two pieces; any further
# countries are dropped explicitly instead of via a warning.
netflix_country <- netflix %>%
  select(country) %>%
  mutate(country = paste0(country, ",")) %>%
  separate(
    col = country, into = c("Country", "Rest"), sep = ",",
    extra = "drop"
  )

# Number of titles per (first-listed) country, most frequent first.
netflix_country_grouped <- netflix_country %>%
  group_by(Country) %>%
  summarise(Total_Movies = n()) %>%
  arrange(desc(Total_Movies))

# Titles with no country come through as "", so turn those into NA and drop
# them.
netflix_country_grouped_clean <- netflix_country_grouped %>%
  mutate(Country = na_if(Country, ""))
netflix_country_grouped_clean <- na.omit(netflix_country_grouped_clean)

# Fix: take the top 10 from the *cleaned* table. The uncleaned table was used
# before, so the blank "country" showed up as one of the bars.
netflix_country_grouped_plot <- head(netflix_country_grouped_clean, 10)

# Fix: the plotly argument is `marker`, not `markers`; with the typo the
# requested bar colour was not applied.
fig8 <- plot_ly(netflix_country_grouped_plot,
  x = ~Country, y = ~Total_Movies, type = "bar",
  text = ~Total_Movies,
  marker = list(color = "yellow")
) %>%
  layout(
    title = "Top 10 Countries with the Highest No of movies Produced",
    xaxis = list(title = "Countries", tickangle = -45),
    yaxis = list(title = "Total Movies Produced")
  )
fig8

```
# Map
```{r}
# # Get world map data for country coordinates
# world_map <- map_data("world")
# 
# # Prepare the data by merging with coordinates
# country_coords <- world_map %>%
#   group_by(region) %>%
#   summarize(
#     lat = mean(lat),
#     lng = mean(long)
#   ) %>%
#   rename(Country = region)
# 
# # Merge country data with coordinates
# map_data <- netflix_country_grouped_clean %>%
#   left_join(country_coords, by = "Country")
# 
# # Filter out rows with missing or invalid coordinates
# map_data_filtered <- map_data %>%
#   filter(!is.na(lat) & !is.na(lng))
# 
# # Create an interactive map with markers
# m <- leaflet(map_data_filtered) %>%
#   addTiles() %>%
#   addMarkers(
#     clusterOptions = markerClusterOptions(),
#     ~lng, ~lat,
#     popup = ~paste("<strong>Country:</strong>", Country, "<br>",
#                    "<strong>Value:</strong>", Total_Movies)
#   )
# 
# # Display the map
# m

```
### Analyze the diversity of content by country.
#### United States
```{r}
# Count titles for every (country, `listed_in` value) pair, largest first.
netflix_diversity <- netflix %>%
  group_by(country, listed_in) %>%
  summarise(total_count = n(), .groups = "drop") %>%
  arrange(desc(total_count))
print(netflix_diversity)
```
## Duration Analysis:
### Compare the average duration of movies vs. TV shows.
```{r, warning=FALSE}
# Make sure the duration column is character before splitting it.
netflix$duration <- as.character(netflix$duration)

# Split values such as "90 min" / "2 Seasons" into a number and its unit, then
# turn the number part into an integer.
netflix_duration <- netflix %>%
  select(type, duration) %>%
  separate(col = duration, into = c("duration", "units"), sep = " ") %>%
  mutate(duration = as.integer(duration))

# Mean duration per content type, rounded down; missing durations are ignored.
netflix_duration_compison_type <- netflix_duration %>%
  group_by(type) %>%
  summarise(Average_Duration = floor(mean(duration, na.rm = TRUE)))

print(netflix_duration_compison_type)
```
### Analyze the distribution of the number of seasons for TV shows.
```{r}
# Frequency of each duration value among TV shows, most common first. The
# duration is then made a factor whose levels follow that order, so the bars
# keep the frequency ordering.
netflix_tvshows_distribution <- netflix %>%
  filter(type == "TV Show") %>%
  group_by(duration) %>%
  summarise(Frequency_totals = n()) %>%
  arrange(desc(Frequency_totals)) %>%
  mutate(duration = factor(duration, levels = unique(duration)))

# Bar chart of the season counts.
fig6 <- netflix_tvshows_distribution %>%
  plot_ly(
    x = ~duration, y = ~Frequency_totals, type = "bar",
    marker = list(color = "green")
  ) %>%
  layout(
    title = "Distribution of Seasons in the TV Shows",
    xaxis = list(title = "Duration", tickangle = -45),
    yaxis = list(title = "Frequency")
  )

# Render the chart.
fig6
```
